This notebook is all about benchmarking some R code used in this package.
Hardware / Software used:
-O2 -Wall $(DEBUGFLAG) -mtune=core2 (R’s defaults)library(data.table)
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
library(microbenchmark)
library(Rcpp)
library(ggplot2)
library(plotly)
Attaching package: <U+393C><U+3E31>plotly<U+393C><U+3E32>
The following object is masked from <U+393C><U+3E31>package:ggplot2<U+393C><U+3E32>:
last_plot
The following object is masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:
filter
The following object is masked from <U+393C><U+3E31>package:graphics<U+393C><U+3E32>:
layout
# Helper function to print data well in tables
print_well <- function(data, digits = 6) {
# To milliseconds
data <- data / 1000000
# Sprintf helper
sprintf_helper <- paste0("%.0", digits, "f")
cat("| Min | 25% | 50% | 75% | Max | Mean | \n| --: | --: | --: | --: | --: | --: | \n| ", sprintf(sprintf_helper, min(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.25)), " | ", sprintf(sprintf_helper, median(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.75)), " | ", sprintf(sprintf_helper, max(data)), " | ", sprintf(sprintf_helper, mean(data)), " | \n", sep = "")
return(data)
}
# Test case function
# Arguments renamed to avoid recursive clash
test_case <- function(f, preds, labels) {
cat("Test case: ", paste(do.call(f, list(preds = preds[1:50],
labels = labels[1:5])), collapse = ", "), " \n", sep = "")
}
For a 10-class vector of 1,000,000 observations:
A = [1:1, 1:2, 1:3, 1:4... 1:10, 2:1, 2:2, 2:3..., 1000000:8, 1000000:9, 1000000:10]
B = [3, 5, 9, 1, 4, 8, 6, ...]
Get the following Vector C:
C = [1:4, 2:6, 3:10, 4:2, 5:5, 6:9, 7:7, ...]
# Generate random data
set.seed(11111)
data <- runif(10000000, 0, 1)
labels <- round(runif(1000000, 0, 9), digits = 0)
# Background truth example
array(data[1:50], dim = c(10, 5))
[,1] [,2] [,3] [,4] [,5]
[1,] 0.5014483 0.57219649 0.70236924 0.06440384 0.78924978
[2,] 0.9702328 0.34292525 0.50889166 0.65780657 0.72449914
[3,] 0.7876004 0.09627503 0.20268701 0.19973930 0.00886554
[4,] 0.9022259 0.74235690 0.90706612 0.24664551 0.57348710
[5,] 0.8141778 0.42274539 0.05441064 0.25997440 0.73034459
[6,] 0.7998922 0.98402494 0.09045308 0.14869691 0.42458612
[7,] 0.1158690 0.89258437 0.85759800 0.21375685 0.07684022
[8,] 0.7171363 0.59541565 0.94780036 0.20598170 0.53318628
[9,] 0.1020639 0.80531234 0.16479666 0.02075780 0.28521238
[10,] 0.6856938 0.42102379 0.42536097 0.63880218 0.86408083
labels[1:5]
[1] 5 1 1 7 8
data[c(1 + labels[1], 11 + labels[2], 21 + labels[3], 31 + labels[4], 41 + labels[5])]
[1] 0.7998922 0.3429253 0.5088917 0.2059817 0.2852124
# How many digits for benchmarking in milliseconds
my_digits <- 6L
# How many runs for benchmarking?
my_runs <- 1000L
# ===== BLOCK 1 =====
faster1 <- function(preds, labels) {
data_len <- length(preds)
label_len <- length(labels)
len_div <- data_len / label_len
dims <- c(len_div, label_len)
label_long <- labels + 1
mat_out <- cbind(label_long, 1:label_len)
vec_out <- array(preds, dim = dims)[mat_out]
return(vec_out)
}
test_case(faster1, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data1 <- print_well(microbenchmark(faster1(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 71.564443 | 75.506931 | 80.075883 | 88.667152 | 271.422576 | 91.143632 |
# ===== BLOCK 2 =====
faster2 <- function(preds, labels) {
data_len <- length(preds)
label_len <- length(labels)
vec_out <- array(preds, dim = c(data_len / label_len, label_len))[cbind(labels + 1, 1:label_len)]
return(vec_out)
}
test_case(faster2, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data2 <- print_well(microbenchmark(faster2(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 67.731910 | 77.624099 | 78.828656 | 82.082812 | 171.282154 | 85.530228 |
# ===== BLOCK 3 =====
faster3 <- function(preds, labels) {
return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[cbind(labels + 1, 1:length(labels))])
}
test_case(faster3, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data3 <- print_well(microbenchmark(faster3(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 66.323125 | 77.211462 | 78.444528 | 82.270790 | 179.514764 | 82.474732 |
# ===== BLOCK 4 =====
faster4 <- function(preds, labels) {
return(array(preds, dim = c(length(preds) / length(labels), length(labels)))[matrix(c(labels + 1, 1:length(labels)), ncol = 2)])
}
test_case(faster4, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data4 <- print_well(microbenchmark(faster4(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 78.247998 | 86.402015 | 87.825909 | 92.492082 | 189.528312 | 92.330280 |
# ===== BLOCK 5 =====
faster5 <- function(preds, labels) {
return(preds[((0:(length(labels) - 1)) * (length(preds) / length(labels))) + labels + 1])
}
test_case(faster5, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data5 <- print_well(microbenchmark(faster5(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 23.923492 | 24.552428 | 24.876114 | 29.834516 | 125.909856 | 28.465982 |
# ===== BLOCK 6 =====
cppFunction("NumericVector faster6(NumericVector preds, NumericVector labels) {
int labels_size = labels.size();
NumericVector selected(labels_size);
selected = (preds.size() / labels_size) * seq(0, labels_size - 1);
selected = selected + labels;
NumericVector to_return(labels_size);
to_return = preds[selected];
return to_return;
}")
test_case(faster6, preds = data, labels = labels)
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
data6 <- print_well(microbenchmark(faster6(data, labels), times = my_runs)$time, digits = my_digits)
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 21.314618 | 21.961134 | 22.393063 | 29.491252 | 115.326864 | 26.078263 |
data_time <- data.table(rbindlist(list(data.frame(Time = data1, Bench = "faster1"),
data.frame(Time = data2, Bench = "faster2"),
data.frame(Time = data3, Bench = "faster3"),
data.frame(Time = data4, Bench = "faster4"),
data.frame(Time = data5, Bench = "faster5"),
data.frame(Time = data6, Bench = "faster6"))))
data_time <- data_time[, t_mean := mean(Time), by = Bench]
data_time <- data_time[, t_median := median(Time), by = Bench]
data_time$Benchs <- data_time$Bench
levels(data_time$Benchs) <- paste0("faster", 1:6, "= [", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(min(Time)), by = Bench]$V1), ", ", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(max(Time)), by = Bench]$V1), "], mean=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(mean(Time)), by = Bench]$V1), ", median=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(median(Time)), by = Bench]$V1))
my_time <- data_time[, list(min(Time), quantile(Time, probs = 0.25), median(Time), quantile(Time, probs = 0.75), max(Time), mean(Time)), by = Bench]
colnames(my_time) <- c("Function", "Min", "25%", "50%", "75%", "Max", "Mean")
my_time <- my_time[order(Mean, decreasing = FALSE), ]
print(my_time, digits = 6)
ggplotly(ggplot(data = data_time, aes(x = Time)) + geom_histogram(aes(y = ..density..), bins = 20, color = "darkblue", fill = "lightblue") + facet_wrap(~ Benchs, ncol = 2) + geom_vline(aes(xintercept = t_mean), color = "blue", linetype = "dashed", size = 2) + geom_vline(aes(xintercept = t_median), color = "red", linetype = "dashed", size = 2) + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_histogram(aes(y = ..density..), bins = 100, position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time, aes(x = Time, y = ..count.., fill = Bench)) + geom_density(position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
Benchmarker <- function(f, size, runs, digits, name) {
data_runs <- list()
for (i in 1:length(size)) {
set.seed(11111)
data <- runif(size[i] * 10, 0, 1)
labels <- round(runif(size[i], 0, 9), digits = 0)
cat(" \n \n## ", name, " run: ",format(size[i], big.mark = ",", scientific = FALSE), " samples (", format(runs[i], big.mark = ",", scientific = FALSE), " times) \n \n", sep = "")
test_case(f, preds = data, labels = labels)
cat(" \n")
data_runs[[i]] <- print_well(microbenchmark(f(data, labels), times = runs[i])$time, digits = digits)
data_runs[[i]] <- data.table(Bench = as.factor(paste0("[", i, "] ", format(size[i], big.mark = ",", scientific = FALSE))), Function = as.factor(name), Time = data_runs[[i]])
gc(verbose = FALSE)
}
return(data_runs)
}
bench_size <- c(100, 1000, 10000, 100000, 1000000, 10000000)
bench_runs <- c(50000, 10000, 5000, 1000, 500, 100)
run1 <- Benchmarker(faster5, bench_size, bench_runs, my_digits, "Pure R")
Test case: 0.902225864585489, 0.342925252625719, 0.94780036341399, 0.259974401211366, 0.533186275744811
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.004181 | 0.004942 | 0.005322 | 0.005702 | 8.467534 | 0.006380 |
Test case: 0.902225864585489, 0.572196492459625, 0.054410636657849, 0.657806565286592, 0.72449914435856
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.016346 | 0.022048 | 0.033072 | 0.039535 | 6.349415 | 0.031636 |
Test case: 0.102063933620229, 0.805312335956842, 0.508891655597836, 0.199739297153428, 0.730344592127949
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.131527 | 0.134569 | 0.136849 | 0.152054 | 2.930470 | 0.156778 |
Test case: 0.902225864585489, 0.98402493679896, 0.857597996480763, 0.24664551159367, 0.573487095767632
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 1.747106 | 1.819237 | 1.893459 | 2.007689 | 7.361718 | 2.037394 |
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 18.687877 | 19.686780 | 20.140377 | 21.708439 | 113.789213 | 21.546394 |
Test case: 0.71713633579202, 0.422745394520462, 0.508891655597836, 0.638802175875753, 0.72449914435856
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 226.084871 | 239.779753 | 253.655959 | 293.223768 | 416.498519 | 267.676075 |
run2 <- Benchmarker(faster6, bench_size, bench_runs, my_digits, "Rcpp")
Test case: 0.902225864585489, 0.342925252625719, 0.94780036341399, 0.259974401211366, 0.533186275744811
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.001520 | 0.002281 | 0.002661 | 0.003041 | 9.970592 | 0.003234 |
Test case: 0.902225864585489, 0.572196492459625, 0.054410636657849, 0.657806565286592, 0.72449914435856
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.009123 | 0.009884 | 0.011025 | 0.015206 | 0.150914 | 0.013138 |
Test case: 0.102063933620229, 0.805312335956842, 0.508891655597836, 0.199739297153428, 0.730344592127949
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.090852 | 0.094274 | 0.099215 | 0.118602 | 3.754226 | 0.114964 |
Test case: 0.902225864585489, 0.98402493679896, 0.857597996480763, 0.24664551159367, 0.573487095767632
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 1.369251 | 1.450125 | 1.509141 | 1.848318 | 11.098076 | 1.716435 |
Test case: 0.799892218317837, 0.342925252625719, 0.508891655597836, 0.205981696723029, 0.285212381277233
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 16.072539 | 16.425306 | 16.746901 | 18.450767 | 95.100196 | 17.832287 |
Test case: 0.71713633579202, 0.422745394520462, 0.508891655597836, 0.638802175875753, 0.72449914435856
| Min | 25% | 50% | 75% | Max | Mean |
|---|---|---|---|---|---|
| 193.767971 | 211.404771 | 223.412703 | 240.699492 | 343.565967 | 230.416580 |
run1_all <- rbindlist(run1)
run2_all <- rbindlist(run2)
run_all <- rbind(run1_all, run2_all)
run_all$Repeats <- rep(inverse.rle(list(lengths = bench_runs, values = bench_size)), 2)
run_all$MilObs <- (1000 / run_all$Time) * run_all$Repeats / 1000000
run_time <- run_all[, list(quantile(Time, probs = 0.05), median(Time), quantile(Time, probs = 0.95), mean(Time)), by = list(Function, Bench)]
colnames(run_time) <- c("Function", "Benchmark", "5%", "50%", "95%", "Mean")
run_time$`Mil.Obs/s` <- (1000 / run_time$Mean) * bench_size / 1000000
run_time$`5%` <- format(run_time$`5%`, digits = 6, scientific = FALSE)
run_time$`50%` <- format(run_time$`50%`, digits = 6, scientific = FALSE)
run_time$`95%` <- format(run_time$`95%`, digits = 6, scientific = FALSE)
run_time$Mean <- format(run_time$Mean, digits = 6, scientific = FALSE)
run_time$`Mil.Obs/s` <- format(run_time$`Mil.Obs/s`, digits = 6, scientific = FALSE)
print(run_time[1:(nrow(run_time) / 2)])
print(run_time[(nrow(run_time) / 2 + 1):nrow(run_time)])
ggplot(run_all, aes(x = Bench, y = Time, color = Function, fill = Bench)) + geom_boxplot() + scale_y_log10(labels = scales::comma, breaks = c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000)) + stat_summary(fun.y = mean, geom = "line", aes(group = Function)) + stat_summary(fun.y = mean, geom = "point", aes(group = Function)) + labs(x = "Benchmark", y = "Time (Milliseconds)") + theme_bw()
ggplot(run_all, aes(x = Bench, y = MilObs, color = Function, fill = Bench)) + geom_boxplot() + scale_y_log10(labels = scales::comma, breaks = c(1, 10, 25, 50, 75, 100, 125, 150, 175, 200), limits = c(1, NA)) + stat_summary(fun.y = mean, geom = "line", aes(group = Function)) + stat_summary(fun.y = mean, geom = "point", aes(group = Function)) + labs(x = "Benchmark", y = "Throughput (Million Obs./s)") + theme_bw()
ggplot(data = run_all, aes(x = MilObs, color = Function, fill = Function, group = Function)) + coord_flip() + stat_ecdf(aes(ymin=..y.., ymax = 1), alpha = 0.5, geom = "ribbon") + stat_ecdf(geom = "line", size = 2, alpha = 0.75, pad = FALSE) + labs(x = "Throughput (Million Obs./s)", y = "Percentile") + facet_wrap(~ Bench, dir = "h", ncol = 2, scales = "free") + theme_bw()